library(edgeR)
library(ggplot2)
library(readr)
library(here)
# Create output directories if they don't exist
dir.create(file.path("edgeR_transcript_prot"), showWarnings = FALSE, recursive = TRUE)
dir.create(file.path("edgeR_gene_prot"), showWarnings = FALSE, recursive = TRUE)
Transcripts
Load count data - Transcripts
counts <- read.csv("/Volumes/sheynkman/projects/LRP_Mohi_project/06_refine_orf_database/protein_counts_matrix.csv",
header = TRUE, row.names = 1)
group <- factor(c("Q157R", "Q157R", "Q157R", "WT", "WT", "WT"))
dge_raw <- DGEList(counts = counts, group = group)
Visualize raw counts
boxplot(dge_raw$counts, main = "Boxplot of Raw Counts", las = 2, col = as.numeric(group))
plotMDS(dge_raw, col = as.numeric(group), main = "MDS Plot for Raw Counts")
plotMD(dge_raw, col = as.numeric(group), main = "MD Plot for Raw Counts")
abline(h = 0, col = "red", lty = 2, lwd = 2)
MD (mean-difference) plots per sample
par(mfrow = c(2, 3), mar = c(4, 4, 2, 1))
for (i in 1:ncol(dge_raw$counts)) {
plotMD(dge_raw, column = i, main = paste("MD Plot for Sample", i))
abline(h = 0, col = "red", lty = 2, lwd = 2)
}
par(mfrow = c(1, 1)) # Reset plotting area to single panel
Filter lowly expressed transcripts (required for modeling)
keep <- filterByExpr(dge_raw)
table(keep)
## keep
## FALSE TRUE
## 144622 27450
dge <- dge_raw[keep, , keep.lib.sizes = FALSE]
Normalize Counts with TMM
dge <- calcNormFactors(dge, method = "TMM")
Save counts matrices
# Save dge_raw and dge_norm objects for downstream correlation analyses
save(dge_raw, file = file.path("edgeR_transcript_prot", "dte_raw.rda"))
save(dge, file = file.path("edgeR_transcript_prot", "dte_norm.rda"))
Visualize normalized counts
boxplot(dge$counts, main = "Boxplot of Normalized Counts")
log_cpm_tx <- cpm(dge, log = TRUE, prior.count = 1)
boxplot(log_cpm_tx,
main = "Boxplot of log2(CPM + 1) - Protein Transcripts",
las = 2,
col = as.numeric(group))
, las = 2, col = as.numeric(group)) plotMDS(dge, col =
as.numeric(group), main = “MDS Plot for Normalized Counts”) plotMD(dge,
col = as.numeric(group), main = “MD Plot for Normalized Counts”)
abline(h = 0, col = “red”, lty = 2, lwd = 2)
Design matrix and dispersion
``` r
design <- model.matrix(~ group)
rownames(design) <- colnames(dge)
dge_disp <- estimateDisp(dge, design)
dge_disp$common.dispersion
## [1] 0.06947344
plotBCV(dge_disp)
Fit model and test
fit <- glmQLFit(dge_disp, design)
plotQLDisp(fit)
result <- glmQLFTest(fit, coef = 2)
topTags(result)
## Coefficient: groupWT
## logFC logCPM F PValue FDR
## PB.21599.2 2.587828 4.864370 120.55023 4.157448e-07 0.01076015
## PB.17614.3 5.731934 1.879437 69.84691 9.345351e-07 0.01076015
## PB.9626.1 2.410092 4.466302 93.26450 1.439576e-06 0.01076015
## PB.18006.1 2.756940 4.316899 88.57754 1.842696e-06 0.01076015
## PB.20180.4 2.182769 4.491793 83.07070 2.496579e-06 0.01076015
## PB.16163.1 2.284374 5.637514 82.26567 2.612815e-06 0.01076015
## PB.11107.2 2.302415 4.344162 81.43147 2.743936e-06 0.01076015
## PB.11890.1 2.253413 4.974258 75.70609 3.863836e-06 0.01223914
## PB.2313.1 2.056590 7.686002 74.86852 4.068739e-06 0.01223914
## PB.24052.20 8.827075 2.250527 103.86255 4.468137e-06 0.01223914
FDR and DEG Summary
FDR <- p.adjust(result$table$PValue, method = "BH")
sum(FDR < 0.05)
## [1] 93
qlf <- glmQLFTest(fit)
top <- rownames(topTags(qlf))
cpm(dge_disp)[top, ]
## BioSample_1 BioSample_2 BioSample_3 BioSample_4 BioSample_5
## PB.21599.2 7.822002 7.576997 9.1346262 41.660344 59.259545
## PB.17614.3 0.000000 0.000000 0.3044875 5.252826 6.452706
## PB.9626.1 8.318638 6.908438 5.3285320 36.045254 33.053657
## PB.18006.1 4.966351 4.234204 5.7852633 30.611296 45.695693
## PB.20180.4 7.201209 8.468408 8.3734074 28.981109 39.506363
## PB.16163.1 17.382228 11.588348 21.1618841 68.467870 94.420208
## PB.11107.2 6.952891 4.902763 8.0689199 28.981109 36.477542
## PB.11890.1 11.670924 8.022703 12.3317454 41.116949 49.119578
## PB.2313.1 91.380854 62.175945 85.1042679 231.124345 344.758862
## PB.24052.20 0.000000 0.000000 0.0000000 5.977354 8.296336
## BioSample_6
## PB.21599.2 46.996000
## PB.17614.3 7.992517
## PB.9626.1 40.921687
## PB.18006.1 25.895755
## PB.20180.4 40.282286
## PB.16163.1 83.282028
## PB.11107.2 34.207973
## PB.11890.1 64.099987
## PB.2313.1 418.648043
## PB.24052.20 12.148626
summary(decideTests(qlf))
## groupWT
## Down 6
## NotSig 27357
## Up 87
plotMD(qlf)
abline(h = c(-1, 1), col = "blue")
Save DEG results & intermediate files
deg_results <- topTags(result, n = Inf)$table
write.table(deg_results,
file = file.path("edgeR_transcript_prot", "transcript_DEG_results.txt"),
sep = "\t", quote = FALSE, row.names = TRUE)
# Raw and normalized CPM Matrices
write.table(cpm(dge_raw),
file = file.path("edgeR_transcript_prot", "raw_CPM_matrix.txt"),
sep = "\t", quote = FALSE)
write.table(cpm(dge),
file = file.path("edgeR_transcript_prot", "normalized_CPM_matrix.txt"),
sep = "\t", quote = FALSE)
# List of filtered-in transcripts
write.table(rownames(dge),
file = file.path("edgeR_transcript_prot", "filtered_transcripts.txt"),
quote = FALSE, row.names = FALSE, col.names = FALSE)
# Design matrix to keep record of the model used
write.table(design,
file = file.path("edgeR_transcript_prot", "design_matrix.txt"),
sep = "\t", quote = FALSE, row.names = TRUE, col.names = NA)
# Top genes table
top_genes <- topTags(qlf, n = 50)$table
write.table(top_genes,
file = file.path("edgeR_transcript_prot", "top_transcripts.txt"),
sep = "\t", quote = FALSE, row.names = TRUE)
# Save dispersion plots (BCV and QL)
png(file.path("edgeR_transcript_prot", "BCV_plot.png"), width = 800, height = 600)
plotBCV(dge_disp)
dev.off()
## quartz_off_screen
## 2
png(file.path("edgeR_transcript_prot", "QLDisp_plot.png"), width = 800, height = 600)
plotQLDisp(fit)
dev.off()
## quartz_off_screen
## 2
# MDS plot (normalized)
png(file.path("edgeR_transcript_prot", "MDS_plot.png"), width = 800, height = 600)
plotMDS(dge, col = as.numeric(group), main = "MDS Plot for Normalized Counts")
dev.off()
## quartz_off_screen
## 2
Volcano Plot - left = downregulated in Q157R vs. WT (higher in WT); right = up-regulated in Q157R vs. WT (higher in Q157R)
deg_results$Gene <- rownames(deg_results)
ggplot(deg_results, aes(x = logFC, y = -log10(PValue))) +
geom_point(aes(color = FDR < 0.05)) +
scale_color_manual(values = c("black", "red")) +
theme_minimal() +
labs(title = "Volcano Plot", x = "Log2 Fold Change", y = "-Log10 P-value")
ggsave(file.path("edgeR_transcript_prot", "transcript_volcano_plot.png"),
width = 6, height = 6, dpi = 300)
Interactive volcano plot
library(ggiraph)
library(plotly)
gg <- ggplot(deg_results, aes(x = logFC, y = -log10(PValue),
tooltip = Gene, data_id = Gene)) +
geom_point_interactive(aes(color = FDR < 0.05)) +
scale_color_manual(values = c("black", "red")) +
theme_minimal() +
labs(title = "Interactive Volcano Plot", x = "Log2 Fold Change", y = "-Log10 P-value")
girafe(ggobj = gg, width_svg = 10, height_svg = 6)
Save count matrices
# Transcript raw counts
write.table(dge_raw$counts,
file.path("edgeR_transcript_prot", "raw_counts_transcripts.txt"),
sep = "\t", quote = FALSE)
# Transcript normalized counts
write.table(cpm(dge),
file.path("edgeR_transcript_prot", "normalized_counts_transcripts.txt"),
sep = "\t", quote = FALSE)
Genes
counts_gene <- read.delim("/Volumes/sheynkman/projects/LRP_Mohi_project/19_LRP_summary/protein/raw_protein_gene_counts_matrix.txt",
header = TRUE, sep = "\t", stringsAsFactors = FALSE)
# Use 'gene_id' as rownames
rownames(counts_gene) <- counts_gene$gene_id
counts_gene$gene_id <- NULL
counts <- as.data.frame(counts_gene)
group <- factor(c("Q157R", "Q157R", "Q157R", "WT", "WT", "WT"))
dge_raw <- DGEList(counts = counts, group = group)
Visualize raw counts - Genes
boxplot(dge_raw$counts, main = "Boxplot of Raw Gene Counts", las = 2, col = as.numeric(group))
plotMDS(dge_raw, col = as.numeric(group), main = "MDS Plot for Raw Gene Counts")
plotMD(dge_raw, col = as.numeric(group), main = "MD Plot for Raw Gene Counts")
abline(h = 0, col = "red", lty = 2, lwd = 2)
MD plots per sample - Genes
par(mfrow = c(2, 3), mar = c(4, 4, 2, 1))
for (i in 1:ncol(dge_raw$counts)) {
plotMD(dge_raw, column = i, main = paste("MD Plot for Sample", i))
abline(h = 0, col = "red", lty = 2, lwd = 2)
}
par(mfrow = c(1, 1))
Filter lowly expressed genes
keep <- filterByExpr(dge_raw)
table(keep)
## keep
## FALSE TRUE
## 2539 9886
dge <- dge_raw[keep, , keep.lib.sizes = FALSE]
Normalize Counts with TMM - Genes
dge <- calcNormFactors(dge, method = "TMM")
Save counts matrices - Genes
# Save dge_raw and dge_norm objects for downstream correlation analyses
save(dge_raw, file = file.path("edgeR_gene_prot", "dge_raw.rda"))
save(dge, file = file.path("edgeR_gene_prot", "dge_norm.rda"))
Visualize normalized counts - Genes
boxplot(dge$counts, main = "Boxplot of Normalized Gene Counts")
## Log-transform and boxplot: Protein-level Gene CPM
log_cpm_gene <- cpm(dge, log = TRUE, prior.count = 1)
boxplot(log_cpm_gene,
main = "Boxplot of log2(CPM + 1) - Protein Genes",
las = 2,
col = as.numeric(group))
, las = 2, col = as.numeric(group)) plotMDS(dge, col =
as.numeric(group), main = “MDS Plot for Normalized Gene Counts”)
plotMD(dge, col = as.numeric(group), main = “MD Plot for Normalized Gene
Counts”) abline(h = 0, col = “red”, lty = 2, lwd = 2)
Design matrix and dispersion - Genes
``` r
design <- model.matrix(~ group)
rownames(design) <- colnames(dge)
dge_disp <- estimateDisp(dge, design)
dge_disp$common.dispersion
## [1] 0.05164303
plotBCV(dge_disp)
Fit model and test - Genes
fit <- glmQLFit(dge_disp, design)
plotQLDisp(fit)
result <- glmQLFTest(fit, coef = 2)
topTags(result)
## Coefficient: groupWT
## logFC logCPM F PValue FDR
## PB.11107 1.825258 4.524892 98.13461 6.479355e-06 0.02571292
## PB.3791 2.265128 3.983362 89.44119 9.304025e-06 0.02571292
## PB.16163 2.210740 5.610552 89.21069 9.376582e-06 0.02571292
## PB.11890 1.915878 5.132346 82.72761 1.254603e-05 0.02571292
## PB.21599 1.643707 5.148811 72.40887 2.087808e-05 0.02571292
## PB.18006 2.115223 4.537860 71.00810 2.250012e-05 0.02571292
## PB.2313 2.024386 7.653194 69.98660 2.374787e-05 0.02571292
## PB.10619 1.607534 4.731096 69.24039 2.474488e-05 0.02571292
## PB.7848 2.263690 3.813640 67.55444 2.722135e-05 0.02571292
## PB.14007 2.107100 5.351992 66.39023 2.899683e-05 0.02571292
FDR and DEG Summary - Genes
FDR <- p.adjust(result$table$PValue, method = "BH")
sum(FDR < 0.05)
## [1] 64
qlf <- glmQLFTest(fit)
top <- rownames(topTags(qlf))
cpm(dge_disp)[top, ]
## BioSample_1 BioSample_2 BioSample_3 BioSample_4 BioSample_5
## PB.11107 9.857822 9.445224 10.549382 31.42123 38.63298
## PB.3791 4.502956 4.393127 6.936580 23.47960 25.41306
## PB.16163 18.376928 11.861444 20.809741 66.46799 92.79617
## PB.11890 15.821196 12.959726 14.595721 44.36954 52.75135
## PB.21599 17.281614 15.595602 18.064011 44.36954 63.66100
## PB.18006 9.249315 8.127286 8.237189 32.62974 49.15759
## PB.2313 92.736548 63.261034 81.215794 222.53828 339.48254
## PB.10619 14.117375 11.641788 12.861576 35.04676 37.22115
## PB.7848 4.624657 4.612784 4.913411 17.43706 29.90527
## PB.14007 18.742032 11.202475 15.173769 49.37622 65.20118
## BioSample_6
## PB.11107 36.00306
## PB.3791 28.00238
## PB.16163 80.31452
## PB.11890 67.69806
## PB.21599 51.54284
## PB.18006 29.54097
## PB.2313 406.03452
## PB.10619 46.31163
## PB.7848 20.46328
## PB.14007 82.93013
summary(decideTests(qlf))
## groupWT
## Down 2
## NotSig 9822
## Up 62
plotMD(qlf)
abline(h = c(-1, 1), col = "blue")
Save DEG results & intermediate files - Genes
deg_results <- topTags(result, n = Inf)$table
write.table(deg_results,
file.path("edgeR_gene_prot", "gene_DEG_results.txt"),
sep = "\t", quote = FALSE, row.names = TRUE)
write.table(cpm(dge_raw),
file.path("edgeR_gene_prot", "raw_CPM_matrix_gene.txt"),
sep = "\t", quote = FALSE)
write.table(cpm(dge),
file.path("edgeR_gene_prot", "normalized_CPM_matrix_gene.txt"),
sep = "\t", quote = FALSE)
write.table(rownames(dge),
file.path("edgeR_gene_prot", "filtered_genes.txt"),
quote = FALSE, row.names = FALSE, col.names = FALSE)
write.table(design,
file.path("edgeR_gene_prot", "design_matrix_gene.txt"),
sep = "\t", quote = FALSE, row.names = TRUE, col.names = NA)
top_genes <- topTags(qlf, n = 50)$table
write.table(top_genes,
file.path("edgeR_gene_prot", "top_genes.txt"),
sep = "\t", quote = FALSE, row.names = TRUE)
png(file.path("edgeR_gene_prot", "BCV_plot_gene.png"), width = 800, height = 600)
plotBCV(dge_disp)
dev.off()
## quartz_off_screen
## 2
png(file.path("edgeR_gene_prot", "QLDisp_plot_gene.png"), width = 800, height = 600)
plotQLDisp(fit)
dev.off()
## quartz_off_screen
## 2
png(file.path("edgeR_gene_prot", "MDS_plot_gene.png"), width = 800, height = 600)
plotMDS(dge, col = as.numeric(group), main = "MDS Plot (Normalized Genes)")
dev.off()
## quartz_off_screen
## 2
Volcano Plot - Genes
deg_results$Gene <- rownames(deg_results)
ggplot(deg_results, aes(x = logFC, y = -log10(PValue))) +
geom_point(aes(color = FDR < 0.05)) +
scale_color_manual(values = c("black", "red")) +
theme_minimal() +
labs(title = "Gene Volcano Plot", x = "Log2 Fold Change", y = "-Log10 P-value")
ggsave(file.path("edgeR_gene_prot", "gene_volcano_plot.png"),
width = 6, height = 6, dpi = 300)
Interactive volcano plot - Genes
gg <- ggplot(deg_results, aes(x = logFC, y = -log10(PValue),
tooltip = Gene, data_id = Gene)) +
geom_point_interactive(aes(color = FDR < 0.05)) +
scale_color_manual(values = c("black", "red")) +
theme_minimal() +
labs(title = "Interactive Gene Volcano Plot", x = "Log2 Fold Change", y = "-Log10 P-value")
girafe(ggobj = gg, width_svg = 10, height_svg = 6)
Save count matrices
# Gene raw counts
write.table(dge_raw$counts,
file.path("edgeR_gene_prot", "raw_counts_genes.txt"),
sep = "\t", quote = FALSE)
# Gene normalized counts
write.table(cpm(dge),
file.path("edgeR_gene_prot", "normalized_counts_genes.txt"),
sep = "\t", quote = FALSE)